{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#引入jieba模块\n",
    "import jieba"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "基础分词、添加自定义词\n",
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "全模式： 上海/自来/自来水/来自/海上///所以/吃/葡萄/不/吐/葡萄/皮\n",
      "精确模式： 上海/自来水/来自/海上/，/所以/吃/葡萄/不吐/葡萄/皮\n",
      "搜索引擎模式 上海/自来/自来水/来自/海上/，/所以/吃/葡萄/不吐/葡萄/皮\n"
     ]
    }
   ],
   "source": [
    "words_a='上海自来水来自海上，所以吃葡萄不吐葡萄皮'\n",
    "seg_a=jieba.cut(words_a,cut_all=True)\n",
    "print(\"全模式：\",\"/\".join(seg_a))\n",
    "seg_b=jieba.cut(words_a,cut_all=False)\n",
    "print(\"精确模式：\",\"/\".join(seg_b))\n",
    "seg_c=jieba.cut_for_search(words_a)\n",
    "print(\"搜索引擎模式\",\"/\".join(seg_c))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "for w in seg_b:\n",
    "    print(w)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "自定义前： 我/为/机器/学习/疯狂/打/call\n",
      "加入‘打call’后： 我/为/机器/学习/疯狂/打call\n"
     ]
    }
   ],
   "source": [
    "#添加和删除自定义词汇\n",
    "words_a1='我为机器学习疯狂打call'\n",
    "print(\"自定义前：\",\"/\".join(jieba.cut(words_a1)))\n",
    "jieba.del_word(\"打call\")\n",
    "jieba.add_word(\"打call\")\n",
    "print(\"加入‘打call’后：\",\"/\".join(jieba.cut(words_a1)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "加载自定义词库前： 在/正义者/联盟/的/电影/里/，/嘻哈侠/和/蝙蝠侠/联手/打败/了/大/boss/，/我/高喊/666/，/为/他们/疯狂/打/call\n",
      "--------VS--------\n",
      "加载自定义词库后： 在/正义者联盟/的/电影/里/，/嘻哈侠/和/蝙蝠侠/联手/打败/了/大/boss/，/我/高喊/666/，/为/他们/疯狂/打call\n"
     ]
    }
   ],
   "source": [
    "#导入自定义词典\n",
    "jieba.del_word(\"打call\")#删除之前添加的词汇\n",
    "words_a2='在正义者联盟的电影里，嘻哈侠和蝙蝠侠联手打败了大boss，我高喊666，为他们疯狂打call'\n",
    "print(\"加载自定义词库前：\",\"/\".join(jieba.cut(words_a2)))\n",
    "jieba.load_userdict(\"./datas/01mydict.txt\")\n",
    "print(\"--------VS--------\")\n",
    "print(\"加载自定义词库后：\",\"/\".join(jieba.cut(words_a2)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['在', '正义者联盟', '的', '电影', '里', '，', '嘻哈侠', '和', '蝙蝠侠', '联手', '打败', '了', '大', 'boss', '，', '我', '高喊', '666', '，', '为', '他们', '疯狂', '打call']\n",
      "['在', '正义者联盟', '的', '电影', '里', '，', '嘻哈侠', '和', '蝙蝠侠', '联手', '打败', '了', '大', 'boss', '，', '我', '高喊', '666', '，', '为', '他们', '疯狂', '打call']\n"
     ]
    }
   ],
   "source": [
    "#获得切词后的数据\n",
    "ls1=[]\n",
    "for item in jieba.cut(words_a2):\n",
    "    ls1.append(item)\n",
    "print(ls1)\n",
    "##用lcut直接获得切词后的数据\n",
    "ls2=jieba.lcut(words_a2)\n",
    "print(ls2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "如果/放到/post/中将/出错/。\n",
      "如果/放到/post/中/将/出错/。\n",
      "「/台/中/」/正确/应该/不会/被/切开\n",
      "「/台中/」/正确/应该/不会/被/切开\n"
     ]
    }
   ],
   "source": [
    "#调整词典，关闭HMM发现新词功能\n",
    "print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))\n",
    "jieba.suggest_freq(('中', '将'), True)\n",
    "print('/'.join(jieba.cut('如果放到post中将出错。', HMM=False)))\n",
    "print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))\n",
    "jieba.suggest_freq('台中', True)\n",
    "print('/'.join(jieba.cut('「台中」正确应该不会被切开', HMM=False)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('蝙蝠侠', 1.1583898043333334),\n",
       " ('正义者联盟', 0.9962306252416666),\n",
       " ('嘻哈侠', 0.9962306252416666),\n",
       " ('boss', 0.9962306252416666),\n",
       " ('666', 0.9962306252416666),\n",
       " ('打call', 0.9962306252416666),\n",
       " ('高喊', 0.7754631501533332),\n",
       " ('联手', 0.6396766018916666),\n",
       " ('打败', 0.6287141659400001),\n",
       " ('疯狂', 0.5589159861316667),\n",
       " ('电影', 0.554823310765),\n",
       " ('他们', 0.2925248854975)]"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import jieba.analyse\n",
    "jieba.analyse.extract_tags(words_a2,topK=20,withWeight=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#http://www.sogou.com/labs/webservice/  搜狗实验室，自动生成词性"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
